Analiza pokazała, że

Podsumowanie

Analiza zbioru danych pozwoliła na powiązanie rozmiaru poławianych śledzi z atrybutami ze zbioru danych. Znaczna część atrybutów jest ze sobą silnie skorelowana. Na podstawie wykorzystanych w badaniu danych można wywnioskować, że realny wpływ na długość poławianych śledzi mają między innymi: 1. temperatura przy powierzchni wody [°C]; 2. dostępność planktonu [zagęszczenie Calanus finmarchicus gat. 2]; 3. oscylacja północnoatlantycka [mb].

Used libraries

library(corrplot)
## corrplot 0.84 loaded
library(DAAG)
## Loading required package: lattice
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Set seed to make experiment results reproducible

# Fix the random-number generator seed so repeated runs give the same results.
set.seed(5)

Read data from file

# Load the data set from CSV; "?" entries are read as missing values (NA).
filename <- "sledzie.csv"
df <- read.csv(file = filename, na.strings = "?")

# Sanity check: confirm the import produced a data frame.
print("Is data.frame instance?")
## [1] "Is data.frame instance?"
print(is.data.frame(df))
## [1] TRUE

Parse data (replace NA values with the column mean)

# Drop the "X" column (presumably a row index - confirm against the CSV).
# drop = FALSE keeps the result a data frame even if a single column remains.
df <- df[, names(df) != "X", drop = FALSE]

# Mean-impute missing values column by column.
# lapply() with `df[] <-` keeps the data-frame structure and each column's own
# type; the previous data.frame(sapply(...)) went through a matrix, so a single
# non-numeric column would have silently turned every column into character.
# Non-numeric columns are left untouched because a mean is undefined for them.
df[] <- lapply(df, function(column) {
  if (is.numeric(column)) {
    column[is.na(column)] <- mean(column, na.rm = TRUE)
  }
  column
})

Data set summary

# Report the number of rows in the cleaned data frame.
print("Size of cleared data:")
## [1] "Size of cleared data:"
print(nrow(df))
## [1] 52582
# Per-column summary statistics (min, quartiles, mean, max) of every attribute.
print(summary(df))
##      length         cfin1             cfin2             chel1       
##  Min.   :19.0   Min.   : 0.0000   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.:24.0   1st Qu.: 0.0000   1st Qu.: 0.2778   1st Qu.: 2.469  
##  Median :25.5   Median : 0.1333   Median : 0.7012   Median : 6.083  
##  Mean   :25.3   Mean   : 0.4458   Mean   : 2.0248   Mean   :10.006  
##  3rd Qu.:26.5   3rd Qu.: 0.3603   3rd Qu.: 1.9973   3rd Qu.:11.500  
##  Max.   :32.5   Max.   :37.6667   Max.   :19.3958   Max.   :75.000  
##      chel2            lcop1              lcop2             fbar       
##  Min.   : 5.238   Min.   :  0.3074   Min.   : 7.849   Min.   :0.0680  
##  1st Qu.:13.589   1st Qu.:  2.5479   1st Qu.:17.808   1st Qu.:0.2270  
##  Median :21.435   Median :  7.1229   Median :25.338   Median :0.3320  
##  Mean   :21.221   Mean   : 12.8108   Mean   :28.419   Mean   :0.3304  
##  3rd Qu.:27.193   3rd Qu.: 21.2315   3rd Qu.:37.232   3rd Qu.:0.4560  
##  Max.   :57.706   Max.   :115.5833   Max.   :68.736   Max.   :0.8490  
##       recr              cumf             totaln             sst       
##  Min.   : 140515   Min.   :0.06833   Min.   : 144137   Min.   :12.77  
##  1st Qu.: 360061   1st Qu.:0.14809   1st Qu.: 306068   1st Qu.:13.63  
##  Median : 421391   Median :0.23191   Median : 539558   Median :13.86  
##  Mean   : 520367   Mean   :0.22981   Mean   : 514973   Mean   :13.87  
##  3rd Qu.: 724151   3rd Qu.:0.29803   3rd Qu.: 730351   3rd Qu.:14.16  
##  Max.   :1565890   Max.   :0.39801   Max.   :1015595   Max.   :14.73  
##       sal            xmonth            nao          
##  Min.   :35.40   Min.   : 1.000   Min.   :-4.89000  
##  1st Qu.:35.51   1st Qu.: 5.000   1st Qu.:-1.89000  
##  Median :35.51   Median : 8.000   Median : 0.20000  
##  Mean   :35.51   Mean   : 7.258   Mean   :-0.09236  
##  3rd Qu.:35.52   3rd Qu.: 9.000   3rd Qu.: 1.63000  
##  Max.   :35.61   Max.   :12.000   Max.   : 5.08000

Show attribute distributions

# Draw one kernel-density plot per attribute, titled with the attribute name.
for (column in names(df)) {
  plot(density(df[[column]]), main = column)
}

Show correlation matrix

# Pairwise correlations between all attributes (cor() with default settings),
# rendered as a pie-style correlation plot.
corr_matrix <- cor(df)
corrplot(
  corr_matrix,
  method = "pie",
  col = c("black", "white"),
  bg = "lightblue"
)

Herring length over time

# Smooth the series by averaging consecutive runs of `number_to_aggregate` rows.
# NOTE(review): the row order is used as the time axis - presumably the rows
# are stored chronologically; confirm against the data description.
number_to_aggregate <- 100

# Bucket id per row: rows 1..100 -> 1, rows 101..200 -> 2, and so on (the last
# bucket may be shorter). Computed directly by integer division instead of
# rep(1:(...), each = , len = ), which relied on partial matching of
# `length.out` and on the unsafe `1:n` idiom.
bucket_id <- (seq_len(nrow(df)) - 1) %/% number_to_aggregate + 1

# Column means per bucket; [-1] drops the grouping column added by aggregate().
aggregated <- aggregate(df, by = list(bucket_id), FUN = mean)[-1]

# Bucket number doubles as the position on the time axis.
aggregated$timeline <- as.numeric(row.names(aggregated))

# Mean length over time, point size showing sea-surface temperature (sst).
p <- ggplot(aggregated, aes(timeline, length, size = sst)) +
  geom_point() +
  theme_bw()
plotly::ggplotly(p)

# Mean sst over time, point size showing mean length.
p_sst <- ggplot(aggregated, aes(timeline, sst, size = length)) +
  geom_point() +
  theme_bw()
plotly::ggplotly(p_sst)

Regression model

# Predictor selection, following the author's reading of the correlation matrix:
#   * sst   - the attribute most strongly correlated with length
#   * cfin2 - strongly correlated with length, only weakly with sst
#   * nao   - correlated with length, but not strongly with sst or cfin2
# The remaining attributes are left out because they are highly correlated
# with one another, so adding them is not expected to help the regression.
# NOTE(review): in the fitted model printed below, the cfin2 coefficient is not
# statistically significant (p = 0.691) - its role should be confirmed.
formula <- length ~ sst + cfin2 + nao
linearMod <- lm(formula, data = df)  # fit the linear model on the full data set
summary(linearMod)
## 
## Call:
## lm(formula = formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3744 -0.9972  0.0312  1.0028  6.7078 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 47.8584067  0.2511569 190.552   <2e-16 ***
## sst         -1.6256714  0.0180124 -90.253   <2e-16 ***
## cfin2       -0.0007262  0.0018283  -0.397    0.691    
## nao         -0.0331445  0.0033535  -9.884   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.479 on 52578 degrees of freedom
## Multiple R-squared:  0.1996, Adjusted R-squared:  0.1996 
## F-statistic:  4371 on 3 and 52578 DF,  p-value: < 2.2e-16
# 5-fold cross-validation of the same model with CVlm (DAAG package).
# Console output is switched off (printit = FALSE) and warnings raised by the
# call are silenced.
cvResults <- suppressWarnings(
  CVlm(
    df,
    form.lm = formula,
    m = 5,
    dots = FALSE,
    seed = 29,
    legend.pos = "topleft",
    printit = FALSE,
    main = "Small symbols are predicted values while bigger ones are actuals."
  )
)

# Overall cross-validation mean square, stored by CVlm in the "ms" attribute.
attr(cvResults, "ms")
## [1] 2.186963